# coding:utf-8
'''
Chinese word segmentation (jieba demo)
https://github.com/fxsjy/jieba
'''
import jieba.posseg as pseg
import jieba.analyse
import wordcloud

# Text to be segmented
text = "中国文化渊远流长、博大精深、绚烂多彩，是东亚文化圈的文化宗主国，在世界文化体系内占有重要地位，由于各地的地理位置、自然条件的差异，人文、经济方面也各有特点。传统文化艺术形式有诗词、戏曲、书法、国画等，而春节、元宵、清明、端午、中秋、重阳等则是中国重要的传统节日。"
# User-supplied words to add to jieba's dictionary
custom_words = ["中国文化"]

# Register each custom word so the tokenizer keeps it as one token
for term in custom_words:
    jieba.add_word(term)

# Tokenize in precise mode and print one token per line
for token in jieba.cut(sentence=text, cut_all=False):
    print(token)

print("------------")

# Tokenize again, this time with part-of-speech tags attached
for token, pos in pseg.cut(sentence=text):
    print("%s %s" % (token, pos))

print("------------")

# Extract the top-10 keywords (with weights) using the TextRank algorithm
ranked = jieba.analyse.textrank(sentence=text, topK=10, withWeight=True)
print(ranked)

# Build a word cloud from the nouns and verbs in the text.
# NOTE(review): assumes the font file "msyh.ttf" (Microsoft YaHei) is
# reachable from the working directory — confirm on the target machine.
word_cloud = wordcloud.WordCloud(width=800, height=600, font_path="msyh.ttf", background_color="white")

# POS tags to keep: nouns (n, nr, ns, nt, nz) and verbs (v, vn).
# A frozenset gives O(1) membership tests instead of scanning a list per token.
_NOUN_VERB_TAGS = frozenset({"n", "nr", "ns", "nt", "nz", "v", "vn"})

# Comprehension replaces the manual append loop (same words, same order)
word_list = [word for word, flag in pseg.cut(sentence=text) if flag in _NOUN_VERB_TAGS]

# WordCloud.generate expects one whitespace-separated string
word_cloud.generate(" ".join(word_list))
word_cloud.to_file("word_cloud.jpg")
    
